### Replication code for "Misdemeanor Disenfranchisement?"
### October 2018
### See the readme file for more details on what goes where in this replication package
### Contact Ariel White with questions: arwhi@mit.edu

# Start from an empty workspace and point R at the folder that holds the raw
# files (edit this path to match your own machine before running).
rm(list = ls())
setwd("/home/ariel/Dropbox (MIT)/Texas/Harris_fullsentencing")

library(foreign)

# The raw case file has no header row; its column labels are kept separately
# in the `x` column of harrisco_colnames.csv.
test2 <- read.csv("Harrisfull_nov14.csv", header = FALSE, stringsAsFactors = FALSE)
top <- read.csv("harrisco_colnames.csv")
colnames(test2) <- top$x

# fyear: first four characters of the filing-date field `fda`.
# filedate: the same field parsed with lubridate::ymd().
library(lubridate)
test2$fyear <- substr(test2$fda, 1, 4)
test2$filedate <- ymd(test2$fda)

# Continue with a data.table copy and release the data.frame.
library(data.table)
harris <- data.table(test2)
rm(test2)

##########################################################################
#sort within def_spn by date/case number
#determine what each person's earliest case number is and drop all later cases for that person.
# so keep rows that are:
# 1. the first case filed
# 2. filed the same day as the first case filed, or
# 3. the same case # as the first case filed.

# First generate a unique SPN for the few people who don't have one (these look
# like early records). Every distinct defendant name among the missing-SPN rows
# gets exactly one synthetic id, shared by all of that name's rows.
# The earlier `1:.N` by name numbered the rows *within* each name instead, so a
# single person's rows received different ids while unrelated people shared
# ids 1, 2, ...; `.GRP` (the group counter) gives one id per name.
# Ids are negated to keep them apart from the SPNs already in the data.
# NOTE(review): this assumes real SPNs are positive numbers -- confirm.
sum(is.na(harris$def_spn))  # how many rows lack an SPN (printed for the log)
setkey(harris, def_spn, def_nam)
harris[is.na(def_spn), def_spn := -.GRP, by = "def_nam"]

# Order each person's rows by filing date; the first row per def_spn is then
# that person's earliest case.
setkey(harris, def_spn, filedate)
library(zoo)  # library() rather than require(): stop here if zoo is missing
harris[, caseorder := seq_len(.N), by = list(def_spn)]
harris[caseorder == 1, firstcase := 1]

# Record the earliest case's filing date and case number on its own row ...
harris[firstcase == 1, firstcasedate := fda]
harris[firstcase == 1, firstcaseno := cas]

# ... then carry both values down to the person's remaining rows.
# na.rm = FALSE keeps the result the same length as the group: with zoo's
# default (na.rm = TRUE) leading NAs are dropped, so a person whose earliest
# row has a missing fda/cas would produce a shorter vector than the group.
harris[, firstcasedate := na.locf(firstcasedate, na.rm = FALSE), by = def_spn]
harris[, firstcaseno := na.locf(firstcaseno, na.rm = FALSE), by = def_spn]

# Keep rows that share the first case's number or its filing date.
firsttime <- harris[cas == firstcaseno | fda == firstcasedate, ]

# Log the sizes before/after and the number of distinct people retained.
dim(harris); dim(firsttime)
length(unique(firsttime$def_spn))
setkey(firsttime, "def_spn", "fda")

# Remove rows whose disposition records a prior probation agreement being
# terminated or revoked -- per the original note, this all happens after the
# first appearance.
firsttime2 <- firsttime[!(disposition %like% "TERMINATED" | disposition %like% "PROBATION REVOKED"), ]

# Log the row counts after and before the filter.
dim(firsttime2); dim(firsttime)

firsttime <- firsttime2

##########################################################################
# Restrict to cases filed in 2000 through 2014 (later analyses use subsets of
# this window). fyear is a character column, so compare against strings.
firsttime <- firsttime[fyear %in% as.character(2000:2014), ]
# Note that this same script was used (with different years subbed in here and
# a different filename up top) to clean up the newly-collected 2012-2016
# placebo data used in SI section 7 as well.

# The full table is no longer needed from here on.
rm(harris)
# Clean up case outcomes.
# jail = 1 when the sentence text mentions any of the custody keywords below.
firsttime[, jail := 0]
for (custody_word in c("HCJ", "CONFINEMENT", "JAIL", "TDC")) {
  firsttime[sentence %like% custody_word & !is.na(sentence), jail := 1]
}

# fine = 1 when the sentence text mentions a fine.
firsttime[, fine := 0]
firsttime[sentence %like% "FINE" & !is.na(sentence), fine := 1]

# Pull the first "$<digits>" run out of the sentence text, then strip the
# dollar sign to get a numeric amount (NA when no "$" is present).
library(stringr)
firsttime[, fineamount := str_extract(sentence, "\\$([0-9])*")]
firsttime[, finenum := as.numeric(gsub("\\$", "", fineamount))]

# probation = 1 when the sentence text mentions probation.
firsttime[, probation := 0]
firsttime[sentence %like% "PROBATION" & !is.na(sentence), probation := 1]

# For each probation row, split its sentence on ", " and keep the first piece
# that contains " PROBATION" (NA when no piece does).
# This is done row by row: grepping over the unlisted pieces of *all* rows only
# lines up when every row contributes exactly one matching piece; a row with
# none or several would shift or length-mismatch the assignment.
firsttime[probation == 1,
          probationchunk := vapply(
            strsplit(sentence, ", "),
            function(pieces) grep(" PROBATION", pieces, value = TRUE)[1L],
            character(1)
          )]

# Strip the keyword, then read the first token as the amount and the second
# token as the unit.
firsttime[probation == 1, probation1 := gsub(" PROBATION", "", probationchunk)]
firsttime$probationtime <- as.numeric(lapply(strsplit(firsttime$probation1, " "), "[", 1))
firsttime$probationtimeunit <- as.character(lapply(strsplit(firsttime$probation1, " "), "[", 2))

# Convert to days. difftime doesn't recognize years/months as units, so they
# are approximated as 365 and 30 days.
firsttime[probationtimeunit %like% "DAY", probationtimeadj := probationtime]
firsttime[probationtimeunit %like% "YEAR", probationtimeadj := 365 * probationtime]
firsttime[probationtimeunit %like% "MONTH", probationtimeadj := 30 * probationtime]

class(firsttime$probationtimeadj)  # interactive sanity check
firsttime[!is.na(probationtimeadj), probationlength := as.difftime(probationtimeadj, units = "days")]
firsttime[!is.na(probationlength), probationdays := as.numeric(probationlength, units = "days")]

# Drop the intermediate parsing columns.
firsttime[, probationchunk := NULL]
firsttime[, probation1 := NULL]
firsttime[, probationtime := NULL]
firsttime[, probationtimeunit := NULL]
firsttime[, probationtimeadj := NULL]

# Work out the length of any custodial sentence.
# For each keyword (names below), rows whose sentence mentions it get
# `jailtest` set to the text preceding the corresponding split marker (values
# below). Keywords are applied in this order, so when a sentence mentions
# several, the last one listed wins.
jail_markers <- c("HCJ" = " HCJ",
                  "CONFINEMENT" = " CONFINEMENT",
                  "STATE JAIL" = " STATE JAIL",
                  "TDC" = " TDC")
for (keyword in names(jail_markers)) {
  text_before <- vapply(strsplit(firsttime$sentence, jail_markers[[keyword]]),
                        `[`, character(1), 1L)
  hit_rows <- which(firsttime$sentence %like% keyword & !is.na(text_before))
  firsttime[hit_rows, jailtest := text_before[hit_rows]]
}

# First whitespace-separated token is read as the amount, second as the unit.
jail_tokens <- strsplit(firsttime$jailtest, " ")
firsttime[, jailtime := as.numeric(vapply(jail_tokens, `[`, character(1), 1L))]
firsttime[, jailtimeunit := vapply(jail_tokens, `[`, character(1), 2L)]

# Convert to days (years and months approximated as 365 and 30 days).
firsttime[jailtimeunit %like% "DAY", jailtimeadj := jailtime]
firsttime[jailtimeunit %like% "YEAR", jailtimeadj := 365 * jailtime]
firsttime[jailtimeunit %like% "MONTH", jailtimeadj := 30 * jailtime]

firsttime[!is.na(jailtimeadj), sentencelength := as.difftime(jailtimeadj, units = "days")]
firsttime[!is.na(sentencelength), sentencedays := as.numeric(sentencelength, units = "days")]

# Drop the intermediate parsing columns (jailtime, sentencelength and
# sentencedays are kept).
firsttime[, c("jailtest", "jailtimeadj", "jailtimeunit") := NULL]

# nonconv = 1 when the disposition text indicates an acquittal or a dismissal.
firsttime[, nonconv := 0]
for (outcome_word in c("ACQUITTAL", "DISMISSED", "DISM OTHER")) {
  firsttime[disposition %like% outcome_word, nonconv := 1]
}

# Birth year (first four characters of def_dob), parsed birth date, and the
# difference between filing date and birth date.
firsttime[, def_yob := substr(def_dob, 1, 4)]
firsttime[, birthdate := ymd(def_dob)]
firsttime[, ageatfile := filedate - birthdate]

# Log rows vs. distinct people before collapsing.
dim(firsttime); length(unique(firsttime$def_spn))

# Next step: collapse multiple charges from the same date into one observation
# (they'll have gone through the same court assignment mechanism). The collapse
# sums jail/probation time and fines, marks the most severe charge class, and
# takes the maximum of the binary jail/fine/probation flags.

# Rank charge classes: the position in this vector is the rank (1..8), and
# codes not listed stay NA. The collapse takes the minimum rank as the most
# severe charge.
charge_levels <- c("F1", "F2", "F3", "FS", "MA", "MB", "MC", "M")
firsttime[, chargetype := as.numeric(match(com_l_d, charge_levels))]

# Each row counts as one case.
firsttime[, casesnum := 1]

# Also (for people with only one case), pull through the actual charge.
firsttime[firstcase == 1, curr_off_lit_1 := curr_off_lit]
firsttime[firstcase == 1, com_off_lit_1 := com_off_lit]

setkey(firsttime, "def_spn", "chargetype")

# Collapse to one row per combination of the grouping columns below:
#   - sums of sentence days, probation days and fine amounts (NAs ignored)
#   - anyjail / anyfine / anyprobation: 1 if any row in the group has the flag
#   - anyconv: 1 unless every row in the group is a non-conviction
#   - mostsevcharge: smallest chargetype rank in the group
#   - numcases: number of rows collapsed
# "firstcasedate" used to appear twice in `by`; the redundant second entry has
# been removed so it is no longer repeated among the grouping columns.
# NOTE(review): def_nam is picked with sample(), so which spelling is kept for
# a group with several names varies between runs unless a seed is set.
collapsedfirsttime <- firsttime[, list(
	totalsentencedays = sum(sentencedays, na.rm = TRUE),
	anyjail = max(jail),
	anyfine = max(fine),
	anyprobation = max(probation),
	totalprobationdays = sum(probationdays, na.rm = TRUE),
	totalfineamt = sum(finenum, na.rm = TRUE),

	anyconv = 1 - min(nonconv),
	mostsevcharge = min(chargetype),
	numcases = sum(casesnum),
	curr_off_lit_1 = max(curr_off_lit_1),
	com_off_lit_1 = max(com_off_lit_1),
	disposition = max(disposition),
	def_nam = sample(def_nam, 1)
), by = c("def_spn", "fyear", "firstcasedate", "crt", "def_stnum", "def_stnam", "def_cty", "def_st", "def_zip", "def_sex", "def_yob", "def_rac", "def_dob", "ageatfile")]
dim(collapsedfirsttime); length(unique(firsttime$def_spn)) #row count should equal the number of distinct SPNs.


######################################################################
# Now go grab other cases for the people in this set, and figure out how many
# got rearrested / felony convictions, etc.
head(collapsedfirsttime); dim(collapsedfirsttime)
# For each SPN, go through 2008-2012 cases and find all of their post-first-time
# cases. How many cases (felony/misd)? How much jail/prison? Any future felony
# convictions?

# First, make the list of all 2008-2012 cases by re-reading the raw file.
test2 <- read.csv("Harrisfull_nov14.csv", stringsAsFactors = FALSE, header = FALSE)
head(test2); tail(test2)
# Column labels are read from the working directory, exactly as in the first
# pass over this file, so both passes label the raw columns identically (this
# used to point at an absolute path in a different folder).
top <- read.csv("harrisco_colnames.csv")
colnames(test2) <- top$x
test2$fyear <- substr(test2$fda, 1, 4)
library(lubridate)
test2$filedate <- ymd(test2$fda)
allcases <- data.table(test2)
# Really care about future cases for these main years, not others.
allcases <- allcases[fyear %in% c("2008", "2009", "2010", "2011", "2012"), ]

# Same custody and non-conviction flags as built for the first-case data.
allcases[, jail := 0]
allcases[sentence %like% "HCJ" & !is.na(sentence), jail := 1]
allcases[sentence %like% "CONFINEMENT" & !is.na(sentence), jail := 1]
allcases[sentence %like% "JAIL" & !is.na(sentence), jail := 1]
allcases[sentence %like% "TDC" & !is.na(sentence), jail := 1]
allcases[, nonconv := 0]
allcases[disposition %like% "ACQUITTAL", nonconv := 1]
allcases[disposition %like% "DISMISSED", nonconv := 1]
allcases[disposition %like% "DISM OTHER", nonconv := 1]

# For every row of collapsedfirsttime, count that person's later cases:
# filed more than 7 days after their first case and before the 2012 election.
# Result columns (named after the loop): def_spn, felonies, misdemeanors,
# felonyconvictions, jailsentences.
election_day <- as.Date("2012-11-06")
personlevels <- as.data.frame(matrix(nrow = nrow(collapsedfirsttime), ncol = 5))
for (i in seq_len(nrow(collapsedfirsttime))) {  # seq_len: no iterations when there are no rows
	spn <- collapsedfirsttime$def_spn[i] #grab values
	firstcase <- ymd(collapsedfirsttime$firstcasedate[i])
	subcases <- allcases[def_spn == spn, ] #go find matches
	#now keep just the ones AFTER the first case and before the 2012 election.
	subcases1 <- subcases[(filedate > (firstcase + days(7))) & (filedate < election_day), ]
	#make dummies for type of case (first letter of the charge level)
	subcases1[, felony := 0]; subcases1[, misdemeanor := 0]
	subcases1[substr(curr_l_d, 1, 1) == "F", felony := 1]
	subcases1[substr(curr_l_d, 1, 1) == "M", misdemeanor := 1]
	setkey(subcases1, def_spn, cas)
	#drop dupes from probation termination, etc.: keep one row per case number.
	#`by` is spelled out because unique() on a data.table only defaulted to the
	#key columns before v1.9.8; newer versions compare every column, which
	#would leave these repeated case rows in place.
	subcases2 <- unique(subcases1, by = c("def_spn", "cas"))
	subcases2[, felonyconviction := 0]
	subcases2[felony == 1 & nonconv == 0, felonyconviction := 1]
	personlevel <- subcases2[, list(felonies = sum(felony), misdemeanors = sum(misdemeanor), felonyconvictions = sum(felonyconviction), jailsentences = sum(jail)), by = c("def_spn")]
	if (nrow(personlevel) == 1) {
		personlevels[i, ] <- personlevel
	} else if (nrow(personlevel) == 0) {
		#no later cases: record zeros for this person
		personlevels[i, ] <- c(spn, 0, 0, 0, 0)
	}
}

# Attach the future-case counts to the main collapsed dataset (note it won't
# really have anything for the 2012-2014 placebo data, but it'll keep it
# consistent). Rows of personlevels are in the same order as
# collapsedfirsttime, so the count columns are bound on side by side.
dim(collapsedfirsttime); dim(personlevels)
names(personlevels) <- c("def_spn", "felonies", "misdemeanors", "felonyconvictions", "jailsentences")
future_counts <- personlevels[, c("felonies", "misdemeanors", "felonyconvictions", "jailsentences")]
collapsedfirsttime <- cbind(collapsedfirsttime, future_counts)
dim(collapsedfirsttime)

# Write the final person-level dataset.
save(collapsedfirsttime, file = "harrisfirsttime200014_withfutureanddisps.Rdata")




